comments = spark.read.parquet("abfss://anly502@marckvaismanblob.dfs.core.windows.net/reddit/parquet/comments")
submissions = spark.read.parquet("abfss://anly502@marckvaismanblob.dfs.core.windows.net/reddit/parquet/submissions")
## create a directory called data/plots and data/csv to save generated data
import os
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
PLOT_DIR = os.path.join("data", "plots")
CSV_DIR = os.path.join("data", "csv")
os.makedirs(PLOT_DIR, exist_ok=True)
os.makedirs(CSV_DIR, exist_ok=True)
# Submission Data
submissions.printSchema();
root |-- adserver_click_url: string (nullable = true) |-- adserver_imp_pixel: string (nullable = true) |-- archived: boolean (nullable = true) |-- author: string (nullable = true) |-- author_cakeday: boolean (nullable = true) |-- author_flair_css_class: string (nullable = true) |-- author_flair_text: string (nullable = true) |-- author_id: string (nullable = true) |-- brand_safe: boolean (nullable = true) |-- contest_mode: boolean (nullable = true) |-- created_utc: long (nullable = true) |-- crosspost_parent: string (nullable = true) |-- crosspost_parent_list: array (nullable = true) | |-- element: struct (containsNull = true) | | |-- approved_at_utc: string (nullable = true) | | |-- approved_by: string (nullable = true) | | |-- archived: boolean (nullable = true) | | |-- author: string (nullable = true) | | |-- author_flair_css_class: string (nullable = true) | | |-- author_flair_text: string (nullable = true) | | |-- banned_at_utc: string (nullable = true) | | |-- banned_by: string (nullable = true) | | |-- brand_safe: boolean (nullable = true) | | |-- can_gild: boolean (nullable = true) | | |-- can_mod_post: boolean (nullable = true) | | |-- clicked: boolean (nullable = true) | | |-- contest_mode: boolean (nullable = true) | | |-- created: double (nullable = true) | | |-- created_utc: double (nullable = true) | | |-- distinguished: string (nullable = true) | | |-- domain: string (nullable = true) | | |-- downs: long (nullable = true) | | |-- edited: boolean (nullable = true) | | |-- gilded: long (nullable = true) | | |-- hidden: boolean (nullable = true) | | |-- hide_score: boolean (nullable = true) | | |-- id: string (nullable = true) | | |-- is_crosspostable: boolean (nullable = true) | | |-- is_reddit_media_domain: boolean (nullable = true) | | |-- is_self: boolean (nullable = true) | | |-- is_video: boolean (nullable = true) | | |-- likes: string (nullable = true) | | |-- link_flair_css_class: string (nullable = true) | | |-- link_flair_text: string (nullable = true) | | |-- locked: boolean (nullable = true) | | |-- media: string (nullable = true) | | |-- mod_reports: array (nullable = true) | | | |-- element: string (containsNull = true) | | |-- name: string (nullable = true) | | |-- num_comments: long (nullable = true) | | |-- num_crossposts: long (nullable = true) | | |-- num_reports: string (nullable = true) | | |-- over_18: boolean (nullable = true) | | |-- parent_whitelist_status: string (nullable = true) | | |-- permalink: string (nullable = true) | | |-- pinned: boolean (nullable = true) | | |-- quarantine: boolean (nullable = true) | | |-- removal_reason: string (nullable = true) | | |-- report_reasons: string (nullable = true) | | |-- saved: boolean (nullable = true) | | |-- score: long (nullable = true) | | |-- secure_media: string (nullable = true) | | |-- selftext: string (nullable = true) | | |-- selftext_html: string (nullable = true) | | |-- spoiler: boolean (nullable = true) | | |-- stickied: boolean (nullable = true) | | |-- subreddit: string (nullable = true) | | |-- subreddit_id: string (nullable = true) | | |-- subreddit_name_prefixed: string (nullable = true) | | |-- subreddit_type: string (nullable = true) | | |-- suggested_sort: string (nullable = true) | | |-- thumbnail: string (nullable = true) | | |-- thumbnail_height: string (nullable = true) | | |-- thumbnail_width: string (nullable = true) | | |-- title: string (nullable = true) | | |-- ups: long (nullable = true) | | |-- url: string (nullable = true) | | |-- user_reports: array (nullable = true) | | | |-- element: string (containsNull = true) | | |-- view_count: string (nullable = true) | | |-- visited: boolean (nullable = true) | | |-- whitelist_status: string (nullable = true) |-- disable_comments: boolean (nullable = true) |-- distinguished: string (nullable = true) |-- domain: string (nullable = true) |-- domain_override: string (nullable = true) |-- edited: string (nullable = true) |-- embed_type: string (nullable = true) |-- embed_url: string (nullable = true) |-- gilded: long (nullable = true) |-- hidden: boolean (nullable = true) |-- hide_score: boolean (nullable = true) |-- href_url: string (nullable = true) |-- id: string (nullable = true) |-- imp_pixel: string (nullable = true) |-- is_crosspostable: boolean (nullable = true) |-- is_reddit_media_domain: boolean (nullable = true) |-- is_self: boolean (nullable = true) |-- is_video: boolean (nullable = true) |-- link_flair_css_class: string (nullable = true) |-- link_flair_text: string (nullable = true) |-- locked: boolean (nullable = true) |-- media: struct (nullable = true) | |-- event_id: string (nullable = true) | |-- oembed: struct (nullable = true) | | |-- author_name: string (nullable = true) | | |-- author_url: string (nullable = true) | | |-- cache_age: long (nullable = true) | | |-- description: string (nullable = true) | | |-- height: long (nullable = true) | | |-- html: string (nullable = true) | | |-- provider_name: string (nullable = true) | | |-- provider_url: string (nullable = true) | | |-- thumbnail_height: long (nullable = true) | | |-- thumbnail_url: string (nullable = true) | | |-- thumbnail_width: long (nullable = true) | | |-- title: string (nullable = true) | | |-- type: string (nullable = true) | | |-- url: string (nullable = true) | | |-- version: string (nullable = true) | | |-- width: long (nullable = true) | |-- reddit_video: struct (nullable = true) | | |-- dash_url: string (nullable = true) | | |-- duration: long (nullable = true) | | |-- fallback_url: string (nullable = true) | | |-- height: long (nullable = true) | | |-- hls_url: string (nullable = true) | | |-- is_gif: boolean (nullable = true) | | |-- scrubber_media_url: string (nullable = true) | | |-- transcoding_status: string (nullable = true) | | |-- width: long (nullable = true) | |-- type: string (nullable = true) |-- media_embed: struct (nullable = true) | |-- content: string (nullable = true) | |-- height: long (nullable = true) | |-- scrolling: boolean (nullable = true) | |-- width: long (nullable = true) |-- mobile_ad_url: string (nullable = true) |-- num_comments: long (nullable = true) |-- num_crossposts: long (nullable = true) |-- original_link: string (nullable = true) |-- over_18: boolean (nullable = true) |-- parent_whitelist_status: string (nullable = true) |-- permalink: string (nullable = true) |-- pinned: boolean (nullable = true) |-- post_hint: string (nullable = true) |-- preview: struct (nullable = true) | |-- enabled: boolean (nullable = true) | |-- images: array (nullable = true) | | |-- element: struct (containsNull = true) | | | |-- id: string (nullable = true) | | | |-- resolutions: array (nullable = true) | | | | |-- element: struct (containsNull = true) | | | | | |-- height: long (nullable = true) | | | | | |-- url: string (nullable = true) | | | | | |-- width: long (nullable = true) | | | |-- source: struct (nullable = true) | | | | |-- height: long (nullable = true) | | | | |-- url: string (nullable = true) | | | | |-- width: long (nullable = true) | | | |-- variants: struct (nullable = true) | | | | |-- gif: struct (nullable = true) | | | | | |-- resolutions: array (nullable = true) | | | | | | |-- element: struct (containsNull = true) | | | | | | | |-- height: long (nullable = true) | | | | | | | |-- url: string (nullable = true) | | | | | | | |-- width: long (nullable = true) | | | | | |-- source: struct (nullable = true) | | | | | | |-- height: long (nullable = true) | | | | | | |-- url: string (nullable = true) | | | | | | |-- width: long (nullable = true) | | | | |-- mp4: struct (nullable = true) | | | | | |-- resolutions: array (nullable = true) | | | | | | |-- element: struct (containsNull = true) | | | | | | | |-- height: long (nullable = true) | | | | | | | |-- url: string (nullable = true) | | | | | | | |-- width: long (nullable = true) | | | | | |-- source: struct (nullable = true) | | | | | | |-- height: long (nullable = true) | | | | | | |-- url: string (nullable = true) | | | | | | |-- width: long (nullable = true) | | | | |-- nsfw: struct (nullable = true) | | | | | |-- resolutions: array (nullable = true) | | | | | | |-- element: struct (containsNull = true) | | | | | | | |-- height: long (nullable = true) | | | | | | | |-- url: string (nullable = true) | | | | | | | |-- width: long (nullable = true) | | | | | |-- source: struct (nullable = true) | | | | | | |-- height: long (nullable = true) | | | | | | |-- url: string (nullable = true) | | | | | | |-- width: long (nullable = true) | | | | |-- obfuscated: struct (nullable = true) | | | | | |-- resolutions: array (nullable = true) | | | | | | |-- element: struct (containsNull = true) | | | | | | | |-- height: long (nullable = true) | | | | | | | |-- url: string (nullable = true) | | | | | | | |-- width: long (nullable = true) | | | | | |-- source: struct (nullable = true) | | | | | | |-- height: long (nullable = true) | | | | | | |-- url: string (nullable = true) | | | | | | |-- width: long (nullable = true) |-- promoted: boolean (nullable = true) |-- promoted_by: string (nullable = true) |-- promoted_display_name: string (nullable = true) |-- promoted_url: string (nullable = true) |-- retrieved_on: long (nullable = true) |-- score: long (nullable = true) |-- secure_media: struct (nullable = true) | |-- event_id: string (nullable = true) | |-- oembed: struct (nullable = true) | | |-- author_name: string (nullable = true) | | |-- author_url: string (nullable = true) | | |-- cache_age: long (nullable = true) | | |-- description: string (nullable = true) | | |-- height: long (nullable = true) | | |-- html: string (nullable = true) | | |-- provider_name: string (nullable = true) | | |-- provider_url: string (nullable = true) | | |-- thumbnail_height: long (nullable = true) | | |-- thumbnail_url: string (nullable = true) | | |-- thumbnail_width: long (nullable = true) | | |-- title: string (nullable = true) | | |-- type: string (nullable = true) | | |-- url: string (nullable = true) | | |-- version: string (nullable = true) | | |-- width: long (nullable = true) | |-- type: string (nullable = true) |-- secure_media_embed: struct (nullable = true) | |-- content: string (nullable = true) | |-- height: long (nullable = true) | |-- media_domain_url: string (nullable = true) | |-- scrolling: boolean (nullable = true) | |-- width: long (nullable = true) |-- selftext: string (nullable = true) |-- spoiler: boolean (nullable = true) |-- stickied: boolean (nullable = true) |-- subreddit: string (nullable = true) |-- subreddit_id: string (nullable = true) |-- suggested_sort: string (nullable = true) |-- third_party_trackers: array (nullable = true) | |-- element: string (containsNull = true) |-- third_party_tracking: string (nullable = true) |-- third_party_tracking_2: string (nullable = true) |-- thumbnail: string (nullable = true) |-- thumbnail_height: long (nullable = true) |-- thumbnail_width: long (nullable = true) |-- title: string (nullable = true) |-- url: string (nullable = true) |-- whitelist_status: string (nullable = true)
print("There're {} columns and {} number of records in Submissions Dataset.".format(len(submissions.columns), submissions.count()))
There're 68 columns and 643255528 number of records in Submissions Dataset.
Some interesting columns for Submissions:
# Comments Data
comments.printSchema()
root |-- author: string (nullable = true) |-- author_cakeday: boolean (nullable = true) |-- author_flair_css_class: string (nullable = true) |-- author_flair_text: string (nullable = true) |-- body: string (nullable = true) |-- can_gild: boolean (nullable = true) |-- controversiality: long (nullable = true) |-- created_utc: long (nullable = true) |-- distinguished: string (nullable = true) |-- edited: string (nullable = true) |-- gilded: long (nullable = true) |-- id: string (nullable = true) |-- is_submitter: boolean (nullable = true) |-- link_id: string (nullable = true) |-- parent_id: string (nullable = true) |-- permalink: string (nullable = true) |-- retrieved_on: long (nullable = true) |-- score: long (nullable = true) |-- stickied: boolean (nullable = true) |-- subreddit: string (nullable = true) |-- subreddit_id: string (nullable = true)
print("There're {} columns and {} number of records in Comments Dataset.".format(len(comments.columns), comments.count()))
There're 21 columns and 4473556762 number of records in Comments Dataset.
Some interesting columns for Comments:
# Submissions
# Check for missing values
selected_cols = ["subreddit", "title", "num_comments", "locked", "created_utc", "retrieved_on", "edited", "distinguished", "is_self"]
submissions.select([count(when(col(each).isNull(), each)).alias(each) for each in selected_cols]).show()
+---------+-----+------------+------+-----------+------------+------+-------------+-------+ |subreddit|title|num_comments|locked|created_utc|retrieved_on|edited|distinguished|is_self| +---------+-----+------------+------+-----------+------------+------+-------------+-------+ | 0| 0| 0| 0| 0| 199221102|486166| 642396599| 0| +---------+-----+------------+------+-----------+------------+------+-------------+-------+
# Comments
# Check for missing values
selected_cols = ["subreddit", "body", "controversiality", "created_utc", "retrieved_on", "distinguished", "score", "gilded"]
comments.select([count(when(col(each).isNull(), each)).alias(each) for each in selected_cols]).show()
+---------+----+----------------+-----------+------------+-------------+-----+------+ |subreddit|body|controversiality|created_utc|retrieved_on|distinguished|score|gilded| +---------+----+----------------+-----------+------------+-------------+-----+------+ | 0| 0| 0| 0| 1031220301| 4318028560| 0| 0| +---------+----+----------------+-----------+------------+-------------+-----+------+
Don't drop the rows with NULL values for now since we will still need the rows where other feaurtes are not NULL.
Determine the top 15 most popular subreddits topics by the number of comments under that subreddit, so that we can understand the trending topics.
Top 15 submission subreddits by the count of highest number of comments. First group the data set by subreddit and sum up the number of comments for each topic. Conduct plotting on the aggregated table. (Bar Chart)
no_1 = (
submissions.groupBy("subreddit")
.agg(sum("num_comments").alias("sum_num_comments"))
.orderBy(col("sum_num_comments"), ascending=False)
)
no_1.show()
+-------------------+----------------+ | subreddit|sum_num_comments| +-------------------+----------------+ | AskReddit| 122207482| | wallstreetbets| 60240209| | AmItheAsshole| 56035695| | teenagers| 38820799| | FreeKarma4U| 35839326| | memes| 32676424| | politics| 30398313| | CryptoCurrency| 28971559| | Superstonk| 28658546| | nba| 21882597| | worldnews| 20536945| | soccer| 18535819| | antiwork| 18489247| | PublicFreakout| 17138346| | nfl| 16625974| | news| 16181448| | unpopularopinion| 15973493| |relationship_advice| 15366463| | dogecoin| 15074840| | conspiracy| 12974781| +-------------------+----------------+ only showing top 20 rows
no_1_df = spark.createDataFrame(no_1.take(15)).toPandas()
no_1_df
| subreddit | sum_num_comments | |
|---|---|---|
| 0 | AskReddit | 122207482 |
| 1 | wallstreetbets | 60240209 |
| 2 | AmItheAsshole | 56035695 |
| 3 | teenagers | 38820799 |
| 4 | FreeKarma4U | 35839326 |
| 5 | memes | 32676424 |
| 6 | politics | 30398313 |
| 7 | CryptoCurrency | 28971559 |
| 8 | Superstonk | 28658546 |
| 9 | nba | 21882597 |
| 10 | worldnews | 20536945 |
| 11 | soccer | 18535819 |
| 12 | antiwork | 18489247 |
| 13 | PublicFreakout | 17138346 |
| 14 | nfl | 16625974 |
plt.rcParams['figure.figsize'] = [30, 15]
plt.bar("subreddit", "sum_num_comments", data = no_1_df)
plt.xlabel("Subreddits (Reddit Topics)")
plt.ylabel("Total Number of Comments")
plt.title("Top 15 most popular subreddits topics by the number of comments under that subreddit")
## Save the plot in the plot dir so that it can be checked in into the repo
plot_fpath = os.path.join(PLOT_DIR, "no_1.png")
plt.savefig(plot_fpath)
plt.show()
## save the csv file in the csv dir
fpath = os.path.join(CSV_DIR, "no_1_df.csv")
no_1_df.to_csv(fpath)
By seeing the popular subreddits that is unlocked, we can keep up to date with the new topics. So determine the top 10 most popular non-locked submission subreddits by submission count.
Top 10 most popular non-locked submission subreddits by submission count. First filter the dataset to exclude the locked subreddits. And group the data set by subreddit then count the number of records in each group. Conduct plotting on the aggregated table. (Bar Chart)
no_2 = (
submissions.filter(col("locked") == "false")
.groupBy("subreddit")
.count()
.orderBy(col("count"), ascending=False)
.collect()
)
no_2_df = spark.createDataFrame(no_2[:10]).toPandas()
no_2_df
| subreddit | count | |
|---|---|---|
| 0 | AskReddit | 6578789 |
| 1 | FreeKarma4U | 4523812 |
| 2 | GaySnapchat | 3135659 |
| 3 | memes | 2695446 |
| 4 | teenagers | 2504530 |
| 5 | jerkbudss | 1856340 |
| 6 | onlyfansgirls101 | 1853240 |
| 7 | AutoNewspaper | 1733583 |
| 8 | wallstreetbets | 1585216 |
| 9 | OnlyFansPromotions | 1512520 |
plt.rcParams['figure.figsize'] = [30, 15]
plt.bar("subreddit", "count", data = no_2_df)
plt.xlabel("Subreddits (Reddit Topics)")
plt.ylabel("Submission Count")
plt.title("Top 10 most popular non-locked submission subreddits by submission count")
## Save the plot in the plot dir so that it can be checked in into the repo
plot_fpath = os.path.join(PLOT_DIR, "no_2.png")
plt.savefig(plot_fpath)
plt.show()
## save the csv file in the csv dir
fpath = os.path.join(CSV_DIR, "no_2_df.csv")
no_2_df.to_csv(fpath)
2022 is a year with Covid and quarantine, we want to see the trends people using Reddit by seeing the number of submissions for each month in that year.
Number of submission count by each month in 2022. First create two new variables from "created_utc":
Filter the dataset to only 2022, then group by month and count for each topic. Conduct plotting on the aggregated table. (Line Chart) (New Variable)
submissions_no_3 = submissions.withColumn("created_utc_year", from_unixtime(col("created_utc"), "yyyy")) \
.withColumn("created_utc_month", from_unixtime(col("created_utc"), "MM"))
no_3 = submissions_no_3.filter(col("created_utc_year") == 2022).groupBy("created_utc_month").count().orderBy(col("created_utc_month"), ascending=True).collect()
no_3_df = spark.createDataFrame(no_3).toPandas()
no_3_df
| created_utc_month | count | |
|---|---|---|
| 0 | 01 | 32091070 |
| 1 | 02 | 29843162 |
| 2 | 03 | 32677372 |
| 3 | 04 | 33002461 |
| 4 | 05 | 34838318 |
| 5 | 06 | 34395243 |
| 6 | 07 | 37222497 |
| 7 | 08 | 38178282 |
plt.rcParams['figure.figsize'] = [30, 15]
plt.plot("created_utc_month", "count", data = no_3_df)
plt.xlabel("Subreddits (Reddit Topics)")
plt.ylabel("Submission Count")
plt.title("Number of submission count by each month in 2022.")
## Save the plot in the plot dir so that it can be checked in into the repo
plot_fpath = os.path.join(PLOT_DIR, "no_3.png")
plt.savefig(plot_fpath)
plt.show()
## save the csv file in the csv dir
fpath = os.path.join(CSV_DIR, "no_3_df.csv")
no_3_df.to_csv(fpath)
Determine the top 15 number of both distinguished and self-posted submissions by subreditts can help to find the valuable authors.
Number of distinguished and self-posted submission count. First filter the dataset by distinguished (not NULL) and self-posted = true.Then group the data set by month and count for each topic. Conduct plotting on the aggregated table. (Bar Chart)
no_4 = submissions.filter(col("distinguished").isNotNull()).filter(col("is_self") == "true").groupBy("subreddit").count().orderBy(col("count"), ascending=False).collect()
no_4_df = spark.createDataFrame(no_4[:15]).toPandas()
no_4_df
| subreddit | count | |
|---|---|---|
| 0 | MillennialBets | 5147 |
| 1 | UglyBreath | 3904 |
| 2 | StudentCoin_BonusCode | 3188 |
| 3 | kucoin | 3158 |
| 4 | ChildrenOfTheLight | 2881 |
| 5 | hutcoinsales | 2391 |
| 6 | Wallstreetsilver | 2243 |
| 7 | xxfitness | 2166 |
| 8 | Crypto_com | 2114 |
| 9 | churning | 1776 |
| 10 | DestinyTheGame | 1763 |
| 11 | SecondaryInfertility | 1716 |
| 12 | future_fight | 1360 |
| 13 | pub00 | 1359 |
| 14 | pokemontrades | 1312 |
plt.rcParams['figure.figsize'] = [30, 15]
plt.bar("subreddit", "count", data = no_4_df)
plt.xlabel("Subreddits (Reddit Topics)")
plt.ylabel("Submission Count")
plt.title("Number of distinguished and self-posted submission count")
## Save the plot in the plot dir so that it can be checked in into the repo
plot_fpath = os.path.join(PLOT_DIR, "no_4.png")
plt.savefig(plot_fpath)
plt.show()
## save the csv file in the csv dir
fpath = os.path.join(CSV_DIR, "no_4_df.csv")
no_4_df.to_csv(fpath)
By seeing the number of submissions edited in the past 15 days, we can see if many people’s first-time submission is inaccurate or always have typos.
Of the submissions are edited, the submission count by the past 15 retrieved dates. First filter the dataset by edited (!= false) and created a new date variable from retrieved date. Then group the data set by date and count for the most recent 5 days. Conduct plotting on the aggregated table. (Line Plot) (New Variable)
submissions_no_5 = submissions.withColumn("retrieved_on_date", from_unixtime(col("retrieved_on"), "yyyy-MM-dd"))
no_5 = submissions_no_5.filter(col("edited")!= "false").groupBy("retrieved_on_date").count().orderBy(col("retrieved_on_date"), ascending=False).collect()
no_5_df = spark.createDataFrame(no_5[:15]).toPandas()
no_5_df
| retrieved_on_date | count | |
|---|---|---|
| 0 | 2022-09-12 | 21547 |
| 1 | 2022-09-10 | 647992 |
| 2 | 2022-08-10 | 20 |
| 3 | 2022-08-09 | 659167 |
| 4 | 2022-07-14 | 68 |
| 5 | 2022-07-07 | 151512 |
| 6 | 2022-07-06 | 466868 |
| 7 | 2022-06-26 | 474439 |
| 8 | 2022-06-25 | 1800189 |
| 9 | 2022-06-24 | 1127572 |
| 10 | 2022-06-02 | 869095 |
| 11 | 2022-06-01 | 961623 |
| 12 | 2022-05-31 | 965549 |
| 13 | 2022-05-30 | 409388 |
| 14 | 2022-02-19 | 6 |
plt.rcParams['figure.figsize'] = [30, 15]
plt.plot("retrieved_on_date", "count", data = no_5_df)
plt.xlabel("Subreddits (Reddit Topics)")
plt.ylabel("Comments Count")
plt.title("Number of edited submissions in the 15 most recent days")
## Save the plot in the plot dir so that it can be checked in into the repo
plot_fpath = os.path.join(PLOT_DIR, "no_5.png")
plt.savefig(plot_fpath)
plt.show()
## save the csv file in the csv dir
fpath = os.path.join(CSV_DIR, "no_5_df.csv")
no_5_df.to_csv(fpath)
Determine how many comments under each subreddit regarding covid or quarantine.
Top 10 subreddits count with the word “covid” or “quarantine” in its body text. First create a new dummy variable with boolean values indicating if the records contains “covid” or “quarantine” in the body column using regular expression. Then filter by the new dummy variable and group the data set by subreddits and count for each topic. Conduct plotting on the aggregated table. (Bar Chart) (New Variable) (RegEx)
comments_no_6 = comments.withColumn("is_covid", col("body").rlike("(?i)^*covid|quarantine$")).filter(col("is_covid") == "true")
no_6 = comments_no_6.groupBy("subreddit").count().orderBy(col("count"), ascending=False).collect()
no_6_df = spark.createDataFrame(no_6[:10]).toPandas()
no_6_df
| subreddit | count | |
|---|---|---|
| 0 | conspiracy | 521722 |
| 1 | AskReddit | 454212 |
| 2 | Coronavirus | 331421 |
| 3 | politics | 294707 |
| 4 | news | 286712 |
| 5 | HermanCainAward | 279145 |
| 6 | worldnews | 244494 |
| 7 | CoronavirusDownunder | 186686 |
| 8 | COVID19positive | 159882 |
| 9 | canada | 142054 |
plt.rcParams['figure.figsize'] = [30, 15]
plt.bar("subreddit", "count", data = no_6_df)
plt.xlabel("Subreddits (Reddit Topics)")
plt.ylabel("Comments Count")
plt.title("Top 10 subreddits comments count with the word “covid” or “quarantine” in its body text")
## Save the plot in the plot dir so that it can be checked in into the repo
plot_fpath = os.path.join(PLOT_DIR, "no_6.png")
plt.savefig(plot_fpath)
plt.show()
## save the csv file in the csv dir
fpath = os.path.join(CSV_DIR, "no_6_df.csv")
no_6_df.to_csv(fpath)
Determine how many comments are under the top 20 most controversial subreddits.
Top 20 most controversial subreddits by comments count. First filter the controversiality scores (= 1). Then group by the subreddits and count for each topic. Conduct plotting on the aggregated table. (Line Chart)
comments.groupBy("controversiality").count().show()
+----------------+----------+ |controversiality| count| +----------------+----------+ | 0|4392642391| | 1| 80914371| +----------------+----------+
no_7 = comments.filter(col("controversiality") == 1).groupBy("subreddit").count().orderBy(col("count"), ascending=False).collect()
no_7_df = spark.createDataFrame(no_7[:20]).toPandas()
no_7_df
| subreddit | count | |
|---|---|---|
| 0 | worldnews | 1456268 |
| 1 | AskReddit | 1333018 |
| 2 | AmItheAsshole | 1288484 |
| 3 | news | 1093084 |
| 4 | soccer | 992053 |
| 5 | nba | 979803 |
| 6 | conspiracy | 904992 |
| 7 | PublicFreakout | 886600 |
| 8 | politics | 805572 |
| 9 | wallstreetbets | 761370 |
| 10 | canada | 619944 |
| 11 | nfl | 549315 |
| 12 | formula1 | 535431 |
| 13 | memes | 530495 |
| 14 | SquaredCircle | 475753 |
| 15 | pics | 465814 |
| 16 | unpopularopinion | 464471 |
| 17 | relationship_advice | 449025 |
| 18 | europe | 434424 |
| 19 | Conservative | 407054 |
plt.rcParams['figure.figsize'] = [30, 15]
plt.plot("subreddit", "count", data = no_7_df)
plt.xlabel("Subreddits (Reddit Topics)")
plt.xticks(rotation = 45)
plt.ylabel("Comments Count")
plt.title("Top 20 most controversial subreddits by comments count")
## Save the plot in the plot dir so that it can be checked in into the repo
plot_fpath = os.path.join(PLOT_DIR, "no_7.png")
plt.savefig(plot_fpath)
plt.show()
## save the csv file in the csv dir
fpath = os.path.join(CSV_DIR, "no_7_df.csv")
no_7_df.to_csv(fpath)
Determine how many comments Reddit retrieved on each month in 2022.
Number of comments Reddit retrieved on each month in 2022. First created a new month variable from the unix timestamp. Then group the dataset by month and count for each month. Conduct plotting on the aggregated table. (Line Chart) (New Variable)
comments_no_8 = comments.withColumn("retrieved_on_year", from_unixtime(col("retrieved_on"), "yyyy")) \
.withColumn("retrieved_on_month", from_unixtime(col("retrieved_on"), "MM"))
no_8 = comments_no_8.filter(col("retrieved_on_year") == 2022).groupBy("retrieved_on_month").count().orderBy(col("retrieved_on_month"), ascending=True).collect()
no_8_df = spark.createDataFrame(no_8).toPandas()
no_8_df
| retrieved_on_month | count | |
|---|---|---|
| 0 | 02 | 592599324 |
| 1 | 03 | 347993643 |
| 2 | 04 | 222053418 |
| 3 | 05 | 365112390 |
| 4 | 06 | 1005849413 |
| 5 | 07 | 213702538 |
| 6 | 08 | 241104667 |
| 7 | 09 | 243424861 |
plt.rcParams['figure.figsize'] = [30, 15]
plt.plot("retrieved_on_month", "count", data = no_8_df)
plt.xlabel("Subreddits (Reddit Topics)")
plt.ylabel("Comments Count")
plt.title("Number of comments Reddit retrieved on each month in 2022")
## Save the plot in the plot dir so that it can be checked in into the repo
plot_fpath = os.path.join(PLOT_DIR, "no_8.png")
plt.savefig(plot_fpath)
plt.show()
## save the csv file in the csv dir
fpath = os.path.join(CSV_DIR, "no_8_df.csv")
no_8_df.to_csv(fpath)
Determine the score distributions for all subreddits.
Subreddits score distribution. First bining the scores into buckets and create a dummy variable. Then group the score categories and count comments for each topic. Conduct plotting on the aggregated table. (Bar Chart) (New Variable)
def categorizer(score):
if score < 0:
return "less than 0"
elif(score >= 0) & (score <= 5000):
return "0 and 5000"
elif(score >= 5000) & (score <= 10000):
return "5000 and 10000"
elif(score >= 10000) & (score <= 15000):
return "10000 and 15000"
elif(score >= 15000) & (score <= 20000):
return "15000 and 20000"
elif(score >= 20000) & (score <= 25000):
return "20000 and 25000"
elif(score >= 25000) & (score <= 30000):
return "25000 and 30000"
else:
return "30000+"
bucket_udf = udf(categorizer, StringType())
comments_no_9 = comments.withColumn("score_bucket", bucket_udf("score"))
no_9 = comments_no_9.groupBy("score_bucket").count().orderBy(col("count"), ascending=False).collect()
no_9_df = spark.createDataFrame(no_9).toPandas()
no_9_df
| score_bucket | count | |
|---|---|---|
| 0 | 0 and 5000 | 4321547116 |
| 1 | less than 0 | 151801174 |
| 2 | 5000 and 10000 | 150313 |
| 3 | 10000 and 15000 | 34353 |
| 4 | 15000 and 20000 | 12637 |
| 5 | 20000 and 25000 | 5520 |
| 6 | 30000+ | 2924 |
| 7 | 25000 and 30000 | 2725 |
plt.rcParams['figure.figsize'] = [15, 10]
plt.scatter("score_bucket", "count", data = no_9_df)
plt.xlabel("Subreddits (Reddit Topics)")
plt.ylabel("Comments Count")
plt.title("Subreddits with top 10 highest scores by comments count")
## Save the plot in the plot dir so that it can be checked in into the repo
plot_fpath = os.path.join(PLOT_DIR, "no_9.png")
plt.savefig(plot_fpath)
plt.show()
## save the csv file in the csv dir
fpath = os.path.join(CSV_DIR, "no_9_df.csv")
no_9_df.to_csv(fpath)
Determine which subreddits are having the good-quality comments in terms of the “gold” status.
Top 20 subreddits that have the highest number of times its comments received Reddit gold. First group by the subreddits and sum up the “gilded”. Conduct plotting on the aggregated table. (Bar Chart)
no_10 = (
comments.groupBy("subreddit")
.agg(sum("gilded").alias("num_of_times_rated_gold"))
.orderBy(col("num_of_times_rated_gold"), ascending=False)
)
no_10.show()
+-------------------+-----------------------+ | subreddit|num_of_times_rated_gold| +-------------------+-----------------------+ | amcstock| 55301| | AskReddit| 44461| | KGBTR| 32054| | wallstreetbets| 28775| | politics| 19820| | Superstonk| 18291| | AmItheAsshole| 17406| | RealDayTrading| 17393| |nygelleredpathsnark| 16753| | okbuddychicanery| 15195| |InstagramLivesNSFWx| 13218| | Fayeandteddy| 12768| | news| 11946| | worldnews| 11445| | RVVTF| 11045| | PublicFreakout| 10433| | UraniumSqueeze| 10135| | JEENEETards| 9555| | BrieEnloeSnarkk| 8873| | Petroteq| 8238| +-------------------+-----------------------+ only showing top 20 rows
no_10_df = spark.createDataFrame(no_10.take(20)).toPandas()
no_10_df
| subreddit | num_of_times_rated_gold | |
|---|---|---|
| 0 | amcstock | 55301 |
| 1 | AskReddit | 44461 |
| 2 | KGBTR | 32054 |
| 3 | wallstreetbets | 28775 |
| 4 | politics | 19820 |
| 5 | Superstonk | 18291 |
| 6 | AmItheAsshole | 17406 |
| 7 | RealDayTrading | 17393 |
| 8 | nygelleredpathsnark | 16753 |
| 9 | okbuddychicanery | 15195 |
| 10 | InstagramLivesNSFWx | 13218 |
| 11 | Fayeandteddy | 12768 |
| 12 | news | 11946 |
| 13 | worldnews | 11445 |
| 14 | RVVTF | 11045 |
| 15 | PublicFreakout | 10433 |
| 16 | UraniumSqueeze | 10135 |
| 17 | JEENEETards | 9555 |
| 18 | BrieEnloeSnarkk | 8873 |
| 19 | Petroteq | 8238 |
plt.rcParams['figure.figsize'] = [30, 15]
plt.bar("subreddit", "num_of_times_rated_gold", data = no_10_df)
plt.xlabel("Subreddits (Reddit Topics)")
plt.xticks(rotation = 45)
plt.ylabel("Number of times rated as gold")
plt.title("Top 20 subreddits that have the highest number of times its comments received gold status")
## Save the plot in the plot dir so that it can be checked in into the repo
plot_fpath = os.path.join(PLOT_DIR, "no_10.png")
plt.savefig(plot_fpath)
plt.show()
## save the csv file in the csv dir
fpath = os.path.join(CSV_DIR, "no_10_df.csv")
no_10_df.to_csv(fpath)
Find the most frequently occured words in submission titles related to "gun violence" in the form of a Word Cloud plot. A word cloud is a collection, or cluster, of words depicted in different sizes. The bigger and bolder the word appears, the more often it’s mentioned within a given text.
Word Cloud plot for submission titles. First extract the title column from submissions dataset then conduct NLP analysis on it. Conduct plotting on the results. (NLP) (Word Cloud)
!pip install wordcloud
!pip install nltk
Requirement already satisfied: wordcloud in /local_disk0/.ephemeral_nfs/envs/pythonEnv-5257efc3-8c9f-4480-8fc6-4c2d2207a4d5/lib/python3.9/site-packages (1.8.2.2) Requirement already satisfied: matplotlib in /databricks/python3/lib/python3.9/site-packages (from wordcloud) (3.4.3) Requirement already satisfied: pillow in /databricks/python3/lib/python3.9/site-packages (from wordcloud) (8.4.0) Requirement already satisfied: numpy>=1.6.1 in /databricks/python3/lib/python3.9/site-packages (from wordcloud) (1.20.3) Requirement already satisfied: python-dateutil>=2.7 in /databricks/python3/lib/python3.9/site-packages (from matplotlib->wordcloud) (2.8.2) Requirement already satisfied: kiwisolver>=1.0.1 in /databricks/python3/lib/python3.9/site-packages (from matplotlib->wordcloud) (1.3.1) Requirement already satisfied: pyparsing>=2.2.1 in /databricks/python3/lib/python3.9/site-packages (from matplotlib->wordcloud) (3.0.4) Requirement already satisfied: cycler>=0.10 in /databricks/python3/lib/python3.9/site-packages (from matplotlib->wordcloud) (0.10.0) Requirement already satisfied: six in /databricks/python3/lib/python3.9/site-packages (from cycler>=0.10->matplotlib->wordcloud) (1.16.0) WARNING: You are using pip version 21.2.4; however, version 22.3.1 is available. You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-5257efc3-8c9f-4480-8fc6-4c2d2207a4d5/bin/python -m pip install --upgrade pip' command. Requirement already satisfied: nltk in /local_disk0/.ephemeral_nfs/envs/pythonEnv-5257efc3-8c9f-4480-8fc6-4c2d2207a4d5/lib/python3.9/site-packages (3.7) Requirement already satisfied: tqdm in /local_disk0/.ephemeral_nfs/envs/pythonEnv-5257efc3-8c9f-4480-8fc6-4c2d2207a4d5/lib/python3.9/site-packages (from nltk) (4.64.1) Requirement already satisfied: click in /databricks/python3/lib/python3.9/site-packages (from nltk) (8.0.3) Requirement already satisfied: regex>=2021.8.3 in /local_disk0/.ephemeral_nfs/envs/pythonEnv-5257efc3-8c9f-4480-8fc6-4c2d2207a4d5/lib/python3.9/site-packages (from nltk) (2022.10.31) Requirement already satisfied: joblib in /databricks/python3/lib/python3.9/site-packages (from nltk) (1.0.1) WARNING: You are using pip version 21.2.4; however, version 22.3.1 is available. You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-5257efc3-8c9f-4480-8fc6-4c2d2207a4d5/bin/python -m pip install --upgrade pip' command.
gun_vio_df = (submissions
.withColumn("is_gun", col("title").rlike("(?i)^*gun violence$"))
.filter(col("is_gun") == "true")
.select(col("title"))
)
gun_vio_df = gun_vio_df.toPandas()
import nltk
from nltk.corpus import stopwords
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to /root/nltk_data... [nltk_data] Package stopwords is already up-to-date! Out[30]: True
# Text of all words in column title
text = " ".join(each for each in gun_vio_df.title.astype(str))
# Create stopword list using nltk's stopwords list
stops = set(stopwords.words('english'))
stops.update(["gun", "violence"])
# Generate a word cloud image
wordcloud = WordCloud(stopwords=stops, background_color="white", width=800, height=400).generate(text)
# Display the generated image:
plt.axis("off")
plt.figure(figsize=(50,30))
plt.tight_layout(pad=0)
plt.imshow(wordcloud, interpolation='bilinear')
plot_fpath = os.path.join(PLOT_DIR, "no_11a_wordcloud.png")
plt.savefig(plot_fpath)
plt.show()
Similar to (a)
Find the most frequently occured words in comment bodies related to "Georgetown" in the form of a Word Cloud plot.
Word Cloud plot for comment bodies. First extract the body column from comments dataset then conduct NLP analysis on it. Conduct plotting on the results. (NLP) (Word Cloud)
gu_df = (comments
.withColumn("is_georgetown", col("body").rlike("(?i)^*georgetown$"))
.filter(col("is_georgetown") == "true")
.select(col("body"))
)
gu_df = gu_df.toPandas()
# Text of all words in column title
text = " ".join(each for each in gu_df.body.astype(str))
# Create stopword list using nltk's stopwords list
stops = set(stopwords.words('english'))
stops.update(["georgetown"])
# Generate a word cloud image
wordcloud = WordCloud(stopwords=stops, background_color="white", width=800, height=400).generate(text)
# Display the generated image:
plt.axis("off")
plt.figure(figsize=(50,30))
plt.tight_layout(pad=0)
plt.imshow(wordcloud, interpolation='bilinear')
plot_fpath = os.path.join(PLOT_DIR, "no_11b_wordcloud.png")
plt.savefig(plot_fpath)
plt.show()
def categorizer(edited):
if edited != "false":
return "true"
bucket_udf = udf(categorizer, StringType())
comments_summary = (
comments.filter(col("edited").isNotNull())
.filter(col("retrieved_on").isNotNull())
.withColumn("edited", bucket_udf("edited"))
.withColumn("is_covid", col("body").rlike("(?i)^*covid$"))
.withColumn("retrieved_on_year", from_unixtime(col("retrieved_on"), "yyyy"))
.withColumn("retrieved_on_month", from_unixtime(col("retrieved_on"), "MM"))
.filter(col("retrieved_on_year") == 2022)
)
comments_summary_table = (
comments_summary.groupBy(["retrieved_on_year", "retrieved_on_month", "is_covid", "edited"])
.count()
.collect()
)
spark.createDataFrame(comments_summary_table).toPandas().sort_values("retrieved_on_month")
| retrieved_on_year | retrieved_on_month | is_covid | edited | count | |
|---|---|---|---|---|---|
| 25 | 2022 | 02 | True | None | 56843 |
| 24 | 2022 | 02 | False | true | 14773820 |
| 7 | 2022 | 02 | False | None | 577767806 |
| 11 | 2022 | 02 | True | true | 855 |
| 15 | 2022 | 03 | True | None | 23205 |
| 29 | 2022 | 03 | False | None | 340310810 |
| 28 | 2022 | 03 | False | true | 7659227 |
| 16 | 2022 | 03 | True | true | 401 |
| 30 | 2022 | 04 | False | None | 216805390 |
| 13 | 2022 | 04 | True | None | 21583 |
| 12 | 2022 | 04 | False | true | 5226084 |
| 31 | 2022 | 04 | True | true | 361 |
| 14 | 2022 | 05 | True | true | 481 |
| 27 | 2022 | 05 | False | true | 10009835 |
| 26 | 2022 | 05 | False | None | 355074209 |
| 10 | 2022 | 05 | True | None | 27865 |
| 21 | 2022 | 06 | False | true | 24207051 |
| 22 | 2022 | 06 | True | None | 56909 |
| 23 | 2022 | 06 | True | true | 879 |
| 4 | 2022 | 06 | False | None | 981584574 |
| 8 | 2022 | 07 | True | true | 122 |
| 17 | 2022 | 07 | False | true | 4859181 |
| 18 | 2022 | 07 | False | None | 208834853 |
| 3 | 2022 | 07 | True | None | 8382 |
| 1 | 2022 | 08 | False | true | 5390996 |
| 2 | 2022 | 08 | False | None | 235703544 |
| 0 | 2022 | 08 | True | None | 9976 |
| 6 | 2022 | 08 | True | true | 151 |
| 19 | 2022 | 09 | True | None | 8781 |
| 5 | 2022 | 09 | False | None | 238008957 |
| 20 | 2022 | 09 | False | true | 5407001 |
| 9 | 2022 | 09 | True | true | 122 |
submissions_summary = (
submissions.withColumn("is_gun", col("title").rlike("(?i)^*gun violence$"))
.filter(col("is_gun") == "true")
)
submissions_summary_table = (
submissions_summary.groupBy("title")
.agg(sum("num_comments").alias("total_comments"),
sum("score").alias("total_score"))
.orderBy(col("total_comments"), ascending=False)
)
submissions_summary_df = spark.createDataFrame(submissions_summary_table.take(10)).toPandas()
submissions_summary_df
| title | total_comments | total_score | |
|---|---|---|---|
| 0 | Students walk out of schools in protest of gun... | 4445 | 68912 |
| 1 | Had this fun little chat with my Dad about a m... | 2640 | 42746 |
| 2 | Liberals say they will allow provinces to ban ... | 2548 | 1533 |
| 3 | Ozzy Osbourne says he's leaving the US because... | 1901 | 11240 |
| 4 | DeSantis vetoes $35M earmarked for Rays facili... | 1506 | 8149 |
| 5 | David Hogg is kicked out of House Judiciary me... | 1456 | 7855 |
| 6 | A child protesting gun violence | 1314 | 5894 |
| 7 | How to stop gun violence | 1279 | 36312 |
| 8 | CMV: The New Assault Weapons Ban Will Harm the... | 1158 | 1804 |
| 9 | DeSantis blocks state money for Tampa Bay Rays... | 1095 | 6084 |
submissions_summary_df.title.values
Out[14]: array(['Students walk out of schools in protest of gun violence',
'Had this fun little chat with my Dad about a meme he sent me relating to gun violence',
'Liberals say they will allow provinces to ban handguns, citing rise in gun violence',
"Ozzy Osbourne says he's leaving the US because of gun violence",
'DeSantis vetoes $35M earmarked for Rays facility after team postures on gun violence',
'David Hogg is kicked out of House Judiciary meeting after calling out GOP complicity in gun violence',
'A child protesting gun violence', 'How to stop gun violence',
'CMV: The New Assault Weapons Ban Will Harm the Democrats Politically And Do Nothing Substantial to Reduce Gun Violence',
'DeSantis blocks state money for Tampa Bay Rays training facility after team tweets against gun violence'],
dtype=object)
submissions_summary_table_2 = (
submissions.groupBy(["subreddit", "over_18", "locked"])
.count()
.orderBy(col("count"), ascending=False)
.collect()
)
spark.createDataFrame(submissions_summary_table_2[:10]).toPandas()
| subreddit | over_18 | locked | count | |
|---|---|---|---|---|
| 0 | AskReddit | False | False | 6235768 |
| 1 | GaySnapchat | True | False | 3135659 |
| 2 | memes | False | False | 2663222 |
| 3 | teenagers | False | False | 2428932 |
| 4 | FreeKarma4U | False | False | 2405135 |
| 5 | dirtykikpals | True | True | 2393270 |
| 6 | FreeKarma4U | True | False | 2118677 |
| 7 | jerkbudss | True | False | 1856340 |
| 8 | onlyfansgirls101 | True | False | 1853240 |
| 9 | AutoNewspaper | False | False | 1733565 |
BitCoin Daily Closed Prices from Yahoo Finance during Jan. 1, 2022 to Aug. 31, 2022
Clcik here to Yahoo Fiance Page
The goal is to join BitCoin price data in 2022 with the submission dataset. And make a timeseries line plot to see how the BitCoin price changed compared to total the number of comments recieved in that day in 2022.
We will be using "created_utc" as the variable to extract date from.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
btc_df = pd.read_csv("data/csv/BTC-USD.csv")
date_and_comments = (submissions.withColumn("created_utc_date", from_unixtime(col("created_utc"), "yyyy-MM-dd"))
.filter(col("created_utc_date") >= "2022-01-01")
.groupBy("created_utc_date")
.agg(sum("num_comments").alias("sum_comments"))
.orderBy(col("created_utc_date"), ascending=True)
)
date_and_comments_df = spark.createDataFrame(date_and_comments.collect()).toPandas()
date_and_comments_df = date_and_comments_df.rename({"created_utc_date":"Date"}, axis=1)
joined_df = btc_df.merge(date_and_comments_df, on='Date', how='left')
joined_df = joined_df[["Date","Close","sum_comments"]]
joined_df.dropna(inplace=True)
import seaborn as sns
plt.rcParams['figure.figsize'] = [15, 10]
plt.subplot(2, 1, 1)
sns.lineplot(data=joined_df, x='Date', y='Close', color = "orange")
plt.xlabel("Jan. 1, 2022 to Aug. 31, 2022")
plt.ylabel("BTC Price in USD")
plt.title("BTC Price from Jan. 1, 2022 to Aug. 31, 2022")
plt.tick_params(axis='x', labelsize=0, length = 0)
plt.subplot(2, 1, 2)
sns.lineplot(data=joined_df, x='Date', y='sum_comments')
plt.xlabel("Jan. 1, 2022 to Aug. 31, 2022")
plt.ylabel("Number of Submission Comments")
plt.title("Number of Submission Comments from Jan. 1, 2022 to Aug. 31, 2022")
plt.tick_params(axis='x', labelsize=0, length = 0)
plt.show()
We can see from the plots that the BitCoin prices and the number of comments do not have a very strong relationship, either positive or negative. However, we can see from some very short time periods, the number of comments rised up dramatically when there's a very big drops in BitCoin price.